library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)AAQoL machine learning analysis with unbalanced random forest
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above")))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Source of Information: Family
ps(Family)# A tibble: 4 × 3
Family n pct
<fct> <int> <dbl>
1 3 1 0.0383
2 No 1258 48.2
3 Yes 1331 51.0
4 <NA> 19 0.728
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
# filter(!is.na(Family)) |>
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 2043
Frequency of class labels: 991, 1052
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 576.0193
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1291
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0616
(OOB) Brier score: 0.24879713
(OOB) Normalized Brier score: 0.99518853
(OOB) AUC: 0.6268316
(OOB) PR-AUC: 0.5798947
(OOB) G-mean: 0.58322112
(OOB) Requested performance error: 0.41677888
Confusion matrix:
predicted
observed No Yes class.error
No 593 398 0.4016
Yes 454 598 0.4316
(OOB) Misclassification rate: 0.4170338
print(rfobj) Sample size: 2043
Frequency of class labels: 991, 1052
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 576.0193
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1291
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0616
(OOB) Brier score: 0.24879713
(OOB) Normalized Brier score: 0.99518853
(OOB) AUC: 0.6268316
(OOB) PR-AUC: 0.5798947
(OOB) G-mean: 0.58322112
(OOB) Requested performance error: 0.41677888
Confusion matrix:
predicted
observed No Yes class.error
No 593 398 0.4016
Yes 454 598 0.4316
(OOB) Misclassification rate: 0.4170338
plot(rfobj,plots.one.page = FALSE)

all No Yes
Age 0.0309 NA NA
Health.Insurance 0.0069 NA NA
Ethnicity 0.0061 NA NA
EnglishSpeak 0.0049 NA NA
Discrimination 0.0006 NA NA
Dental.Insurance -0.0022 NA NA
Employment -0.0029 NA NA
Income_median -0.0059 NA NA
Gender -0.0063 NA NA
EnglishDiff -0.0071 NA NA
Religion -0.0191 NA NA
rfobj$importance all No Yes
Ethnicity 0.0060708045 NA NA
Age 0.0308768777 NA NA
Gender -0.0063052764 NA NA
Religion -0.0190960623 NA NA
Employment -0.0029185559 NA NA
Income_median -0.0058812711 NA NA
EnglishSpeak 0.0048920137 NA NA
EnglishDiff -0.0071158157 NA NA
Health.Insurance 0.0068973926 NA NA
Dental.Insurance -0.0021882329 NA NA
Discrimination 0.0006458885 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
pos<- rfdata |> filter(Family=="Yes")
neg <- rfdata |> filter(Family=="No")
set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Family~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=train, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1635
Frequency of class labels: 803, 832
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 360.381
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1033
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0361
(OOB) Brier score: 0.18045444
(OOB) Normalized Brier score: 0.72181774
(OOB) AUC: 0.80763094
(OOB) PR-AUC: 0.79597193
(OOB) G-mean: 0.73795692
(OOB) Requested performance error: 0.26204308
Confusion matrix:
predicted
observed Yes No class.error
Yes 584 219 0.2727
No 209 623 0.2512
(OOB) Misclassification rate: 0.2617737
plot(rfobj,plots.one.page = FALSE)

all Yes No
EnglishDiff 0.0719 NA NA
Ethnicity 0.0515 NA NA
Employment 0.0498 NA NA
Religion 0.0412 NA NA
EnglishSpeak 0.0353 NA NA
Dental.Insurance 0.0296 NA NA
Age 0.0249 NA NA
Income_median 0.0184 NA NA
Gender 0.0167 NA NA
Health.Insurance 0.0128 NA NA
Discrimination 0.0076 NA NA
rfobj$importance all Yes No
Ethnicity 0.051456689 NA NA
Age 0.024930992 NA NA
Gender 0.016728281 NA NA
Religion 0.041167460 NA NA
Employment 0.049775540 NA NA
Income_median 0.018418546 NA NA
EnglishSpeak 0.035258321 NA NA
EnglishDiff 0.071887244 NA NA
Health.Insurance 0.012794833 NA NA
Dental.Insurance 0.029610793 NA NA
Discrimination 0.007621102 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
210.0000000 198.0000000 1.0606061 0.4852941 0.6010101 0.5047619
prec npv misclass brier brier.norm auc
0.5336323 0.5729730 0.4485294 0.2510778 1.0043113 0.6057961
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5653207 0.5506433 0.4852941 0.5777928 0.5580541 0.5507155
gmean
0.5507876
test_rf$importance all Yes No
Ethnicity 0.006328172 NA NA
Age 0.019448825 NA NA
Gender 0.006508259 NA NA
Religion -0.004444357 NA NA
Employment 0.004840415 NA NA
Income_median 0.002827600 NA NA
EnglishSpeak 0.010967159 NA NA
EnglishDiff -0.003174699 NA NA
Health.Insurance 0.004588388 NA NA
Dental.Insurance 0.004718407 NA NA
Discrimination 0.008131573 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Source of Information: Health Professionals
ps(`Heal Professionals`)# A tibble: 3 × 3
`Heal Professionals` n pct
<fct> <int> <dbl>
1 No 1326 50.8
2 Yes 1264 48.4
3 <NA> 19 0.728
rfdata <- qol |>
select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imbalanced(Heal.Professionals ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")->rfobj
print(rfobj) Sample size: 2044
Frequency of class labels: 990, 1054
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 570.5297
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1292
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0646
(OOB) Brier score: 0.25053189
(OOB) Normalized Brier score: 1.00212757
(OOB) AUC: 0.62961877
(OOB) PR-AUC: 0.5837091
(OOB) G-mean: 0.59764134
(OOB) Requested performance error: 0.40235866
Confusion matrix:
predicted
observed No Yes class.error
No 607 383 0.3869
Yes 440 614 0.4175
(OOB) Misclassification rate: 0.4026419
plot(rfobj,plots.one.page = FALSE)

all No Yes
EnglishSpeak 0.0180 NA NA
Discrimination 0.0124 NA NA
Dental.Insurance 0.0113 NA NA
Health.Insurance 0.0039 NA NA
EnglishDiff 0.0035 NA NA
Income_median 0.0015 NA NA
Gender -0.0052 NA NA
Ethnicity -0.0069 NA NA
Religion -0.0099 NA NA
Employment -0.0112 NA NA
Age -0.0130 NA NA
rfobj$importance all No Yes
Ethnicity -0.006852786 NA NA
Age -0.013024851 NA NA
Gender -0.005190613 NA NA
Religion -0.009945867 NA NA
Employment -0.011239457 NA NA
Income_median 0.001458609 NA NA
EnglishSpeak 0.017958624 NA NA
EnglishDiff 0.003497160 NA NA
Health.Insurance 0.003911037 NA NA
Dental.Insurance 0.011261219 NA NA
Discrimination 0.012365445 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
pos<- rfdata |> filter(Heal.Professionals=="Yes")
neg <- rfdata |> filter(Heal.Professionals==0)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Heal.Professionals~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Heal.Professionals ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1636
Frequency of class labels: 804, 832
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 341.4653
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1034
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0348
(OOB) Brier score: 0.157625
(OOB) Normalized Brier score: 0.6305
(OOB) AUC: 0.85729107
(OOB) PR-AUC: 0.84956315
(OOB) G-mean: 0.78549864
(OOB) Requested performance error: 0.21450136
Confusion matrix:
predicted
observed Yes No class.error
Yes 634 170 0.2114
No 181 651 0.2175
(OOB) Misclassification rate: 0.2145477
print(rfobj) Sample size: 1636
Frequency of class labels: 804, 832
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 341.4653
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1034
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0348
(OOB) Brier score: 0.157625
(OOB) Normalized Brier score: 0.6305
(OOB) AUC: 0.85729107
(OOB) PR-AUC: 0.84956315
(OOB) G-mean: 0.78549864
(OOB) Requested performance error: 0.21450136
Confusion matrix:
predicted
observed Yes No class.error
Yes 634 170 0.2114
No 181 651 0.2175
(OOB) Misclassification rate: 0.2145477
plot(rfobj,plots.one.page = FALSE)

all Yes No
Religion 0.0674 NA NA
Ethnicity 0.0654 NA NA
EnglishSpeak 0.0537 NA NA
Age 0.0464 NA NA
EnglishDiff 0.0312 NA NA
Income_median 0.0296 NA NA
Discrimination 0.0281 NA NA
Dental.Insurance 0.0269 NA NA
Gender 0.0220 NA NA
Health.Insurance 0.0201 NA NA
Employment 0.0162 NA NA
rfobj$importance all Yes No
Ethnicity 0.06541069 NA NA
Age 0.04640521 NA NA
Gender 0.02195136 NA NA
Religion 0.06738251 NA NA
Employment 0.01623423 NA NA
Income_median 0.02957817 NA NA
EnglishSpeak 0.05370562 NA NA
EnglishDiff 0.03115796 NA NA
Health.Insurance 0.02010926 NA NA
Dental.Insurance 0.02685986 NA NA
Discrimination 0.02805890 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
210.0000000 198.0000000 1.0606061 0.4852941 0.5656566 0.5285714
prec npv misclass brier brier.norm auc
0.5308057 0.5634518 0.4534314 0.2698256 1.0793025 0.5750722
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5476773 0.5465636 0.4852941 0.5507492 0.5472385 0.5466817
gmean
0.5467997
test_rf$importance all Yes No
Ethnicity -0.0221623422 NA NA
Age 0.0004597253 NA NA
Gender -0.0031441717 NA NA
Religion -0.0018772342 NA NA
Employment -0.0100570837 NA NA
Income_median -0.0023044699 NA NA
EnglishSpeak 0.0036238669 NA NA
EnglishDiff -0.0002706752 NA NA
Health.Insurance 0.0024200857 NA NA
Dental.Insurance 0.0060652123 NA NA
Discrimination -0.0004590801 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Health Insurance
ps(`Health Insurance`)# A tibble: 3 × 3
`Health Insurance` n pct
<fct> <int> <dbl>
1 0 381 14.6
2 Yes 2207 84.6
3 <NA> 21 0.805
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 1936
Frequency of class labels: 259, 1677
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 232.2483
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 1224
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 6.4749
(OOB) Brier score: 0.10507578
(OOB) Normalized Brier score: 0.42030312
(OOB) AUC: 0.73686349
(OOB) PR-AUC: 0.31633622
(OOB) G-mean: 0.66187789
(OOB) Requested performance error: 0.33812211
Confusion matrix:
predicted
observed 0 Yes class.error
0 198 61 0.2355
Yes 716 961 0.4270
(OOB) Misclassification rate: 0.401343
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1677.0000000 259.0000000 6.4749035 0.1337810 0.7644788 0.5730471
prec npv misclass brier brier.norm auc
0.2166302 0.9403131 0.4013430 0.1050758 0.4203031 0.7368635
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3375959 0.4580445 0.1337810 0.3163362 0.4997369 0.5599612
gmean
0.6618779
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
#
#
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1550
Frequency of class labels: 760, 790
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 206.605
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 980
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0395
(OOB) Brier score: 0.07716915
(OOB) Normalized Brier score: 0.30867661
(OOB) AUC: 0.98931046
(OOB) PR-AUC: 0.98844527
(OOB) G-mean: 0.93960883
(OOB) Requested performance error: 0.06039117
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 692 68 0.0895
0 24 766 0.0304
(OOB) Misclassification rate: 0.05935484
print(rfobj) Sample size: 1550
Frequency of class labels: 760, 790
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 206.605
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 980
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0395
(OOB) Brier score: 0.07716915
(OOB) Normalized Brier score: 0.30867661
(OOB) AUC: 0.98931046
(OOB) PR-AUC: 0.98844527
(OOB) G-mean: 0.93960883
(OOB) Requested performance error: 0.06039117
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 692 68 0.0895
0 24 766 0.0304
(OOB) Misclassification rate: 0.05935484
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Religion 0.0111 NA NA
EnglishDiff 0.0100 NA NA
EnglishSpeak 0.0088 NA NA
Ethnicity 0.0085 NA NA
Community.Shares.Values 0.0076 NA NA
Income_median 0.0068 NA NA
Helpful.Friends 0.0051 NA NA
Community.Trust 0.0049 NA NA
Close.Friends 0.0044 NA NA
Helpful.Family 0.0039 NA NA
Successful.Family 0.0037 NA NA
Religious.Importance 0.0037 NA NA
Close.Family 0.0036 NA NA
Get.Along 0.0035 NA NA
Close.knit.Community 0.0023 NA NA
Togetherness 0.0018 NA NA
Similar.Values 0.0018 NA NA
Helpful.Community 0.0013 NA NA
Family.Pride 0.0011 NA NA
Family.Respect 0.0011 NA NA
See.Friends 0.0010 NA NA
See.Family 0.0010 NA NA
Discrimination 0.0006 NA NA
Gender 0.0005 NA NA
Employment 0.0004 NA NA
Spend.Time.Together 0.0004 NA NA
rfobj$importance all Yes 0
Ethnicity 0.0084731502 NA NA
Age -0.0013568366 NA NA
Gender 0.0004875932 NA NA
Religion 0.0110560906 NA NA
Employment 0.0004326155 NA NA
Income_median 0.0068200365 NA NA
EnglishSpeak 0.0088425976 NA NA
EnglishDiff 0.0099724240 NA NA
See.Family 0.0009860864 NA NA
Close.Family 0.0036275503 NA NA
Helpful.Family 0.0038767101 NA NA
See.Friends 0.0009860864 NA NA
Close.Friends 0.0044259859 NA NA
Helpful.Friends 0.0051039003 NA NA
Family.Respect 0.0011032086 NA NA
Similar.Values 0.0017796091 NA NA
Successful.Family 0.0037485625 NA NA
Trust -0.0008646339 NA NA
Loyalty -0.0015400189 NA NA
Family.Pride 0.0011032086 NA NA
Expression 0.0003687744 NA NA
Spend.Time.Together 0.0004272952 NA NA
Feel.Close -0.0001887636 NA NA
Togetherness 0.0017796091 NA NA
Religious.Attendance -0.0011390952 NA NA
Religious.Importance 0.0036871646 NA NA
Close.knit.Community 0.0022779010 NA NA
Helpful.Community 0.0013463726 NA NA
Community.Shares.Values 0.0076247813 NA NA
Get.Along 0.0034594049 NA NA
Community.Trust 0.0049194518 NA NA
Discrimination 0.0006135220 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
335.0000000 51.0000000 6.5686275 0.1321244 0.9607843 0.1283582
prec npv misclass brier brier.norm auc
0.1436950 0.9555556 0.7616580 0.1669358 0.6677432 0.6633597
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.2500000 0.2375691 0.1321244 0.2230538 0.3005880 0.2943725
gmean
0.3511760
test_rf$importance all Yes 0
Ethnicity 0.0185811541 NA NA
Age 0.0046936485 NA NA
Gender -0.0049165067 NA NA
Religion 0.0049522296 NA NA
Employment 0.0249069777 NA NA
Income_median 0.0384678821 NA NA
EnglishSpeak 0.0477676308 NA NA
EnglishDiff 0.0068514128 NA NA
See.Family -0.0026200259 NA NA
Close.Family 0.0014578439 NA NA
Helpful.Family -0.0090863643 NA NA
See.Friends -0.0033656426 NA NA
Close.Friends -0.0005776834 NA NA
Helpful.Friends -0.0001471166 NA NA
Family.Respect 0.0026461124 NA NA
Similar.Values 0.0004857335 NA NA
Successful.Family 0.0010069939 NA NA
Trust -0.0003749214 NA NA
Loyalty 0.0021279686 NA NA
Family.Pride 0.0042392685 NA NA
Expression 0.0035005613 NA NA
Spend.Time.Together 0.0004593727 NA NA
Feel.Close 0.0011686196 NA NA
Togetherness 0.0141334482 NA NA
Religious.Attendance 0.0138012802 NA NA
Religious.Importance 0.0135173515 NA NA
Close.knit.Community 0.0017685922 NA NA
Helpful.Community 0.0146127717 NA NA
Community.Shares.Values 0.0049706964 NA NA
Get.Along 0.0117843509 NA NA
Community.Trust 0.0121515197 NA NA
Discrimination -0.0003562717 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Dental Insurance
ps(`Dental Insurance`)# A tibble: 3 × 3
`Dental Insurance` n pct
<fct> <int> <dbl>
1 0 1050 40.2
2 Yes 1529 58.6
3 <NA> 30 1.15
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 1932
Frequency of class labels: 760, 1172
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 353.968
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 1221
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.5421
(OOB) Brier score: 0.17800194
(OOB) Normalized Brier score: 0.71200778
(OOB) AUC: 0.79767323
(OOB) PR-AUC: 0.71118357
(OOB) G-mean: 0.73529229
(OOB) Requested performance error: 0.26470771
Confusion matrix:
predicted
observed 0 Yes class.error
0 588 172 0.2263
Yes 353 819 0.3012
(OOB) Misclassification rate: 0.2717391
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1172.0000000 760.0000000 1.5421053 0.3933747 0.7736842 0.6988055
prec npv misclass brier brier.norm auc
0.6248672 0.8264379 0.2717391 0.1780019 0.7120078 0.7976732
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6913580 0.7228198 0.3933747 0.7111836 0.7133252 0.7290560
gmean
0.7352923
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1546
Frequency of class labels: 757, 789
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 227.409
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 977
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0423
(OOB) Brier score: 0.11458659
(OOB) Normalized Brier score: 0.45834637
(OOB) AUC: 0.9374595
(OOB) PR-AUC: 0.93163041
(OOB) G-mean: 0.86901707
(OOB) Requested performance error: 0.13098293
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 649 108 0.1427
0 94 695 0.1191
(OOB) Misclassification rate: 0.1306598
print(rfobj) Sample size: 1546
Frequency of class labels: 757, 789
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 227.409
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 977
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0423
(OOB) Brier score: 0.11458659
(OOB) Normalized Brier score: 0.45834637
(OOB) AUC: 0.9374595
(OOB) PR-AUC: 0.93163041
(OOB) G-mean: 0.86901707
(OOB) Requested performance error: 0.13098293
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 649 108 0.1427
0 94 695 0.1191
(OOB) Misclassification rate: 0.1306598
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Income_median 0.0322 NA NA
Employment 0.0277 NA NA
EnglishSpeak 0.0256 NA NA
Religion 0.0197 NA NA
EnglishDiff 0.0185 NA NA
Ethnicity 0.0168 NA NA
Religious.Attendance 0.0111 NA NA
Age 0.0092 NA NA
Religious.Importance 0.0086 NA NA
Community.Shares.Values 0.0086 NA NA
Close.knit.Community 0.0085 NA NA
Helpful.Family 0.0075 NA NA
Get.Along 0.0074 NA NA
Expression 0.0071 NA NA
Successful.Family 0.0066 NA NA
Trust 0.0059 NA NA
Helpful.Community 0.0053 NA NA
Loyalty 0.0053 NA NA
Similar.Values 0.0052 NA NA
Spend.Time.Together 0.0052 NA NA
Community.Trust 0.0046 NA NA
Family.Respect 0.0046 NA NA
Close.Friends 0.0033 NA NA
Togetherness 0.0033 NA NA
Family.Pride 0.0033 NA NA
Helpful.Friends 0.0033 NA NA
rfobj$importance all Yes 0
Ethnicity 0.0168312534 NA NA
Age 0.0092060454 NA NA
Gender 0.0001416190 NA NA
Religion 0.0197281929 NA NA
Employment 0.0276506891 NA NA
Income_median 0.0322000726 NA NA
EnglishSpeak 0.0256452206 NA NA
EnglishDiff 0.0184910635 NA NA
See.Family 0.0026821579 NA NA
Close.Family 0.0026821579 NA NA
Helpful.Family 0.0074523783 NA NA
See.Friends 0.0008143837 NA NA
Close.Friends 0.0033056447 NA NA
Helpful.Friends 0.0032592303 NA NA
Family.Respect 0.0045539676 NA NA
Similar.Values 0.0052243540 NA NA
Successful.Family 0.0065666900 NA NA
Trust 0.0059437572 NA NA
Loyalty 0.0053212736 NA NA
Family.Pride 0.0032592303 NA NA
Expression 0.0070987707 NA NA
Spend.Time.Together 0.0051788055 NA NA
Feel.Close 0.0020108405 NA NA
Togetherness 0.0032592303 NA NA
Religious.Attendance 0.0110745251 NA NA
Religious.Importance 0.0086366602 NA NA
Close.knit.Community 0.0085335299 NA NA
Helpful.Community 0.0053212736 NA NA
Community.Shares.Values 0.0085841206 NA NA
Get.Along 0.0073960245 NA NA
Community.Trust 0.0046004516 NA NA
Discrimination 0.0021093322 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
234.0000000 152.0000000 1.5394737 0.3937824 0.8618421 0.5512821
prec npv misclass brier brier.norm auc
0.5550847 0.8600000 0.3264249 0.1880273 0.7521092 0.7908232
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6752577 0.6735621 0.3937824 0.7055791 0.6822729 0.6814251
gmean
0.6892881
test_rf$importance all Yes 0
Ethnicity 9.178649e-03 NA NA
Age 8.612435e-03 NA NA
Gender 1.066123e-03 NA NA
Religion 4.104232e-04 NA NA
Employment 3.012834e-02 NA NA
Income_median 6.680711e-02 NA NA
EnglishSpeak 1.587445e-02 NA NA
EnglishDiff 5.340320e-03 NA NA
See.Family -2.620456e-03 NA NA
Close.Family 4.712252e-04 NA NA
Helpful.Family 2.130361e-03 NA NA
See.Friends -2.915042e-03 NA NA
Close.Friends -2.087536e-03 NA NA
Helpful.Friends 5.657160e-04 NA NA
Family.Respect 6.766214e-04 NA NA
Similar.Values -1.157392e-03 NA NA
Successful.Family 1.433907e-05 NA NA
Trust 4.245357e-04 NA NA
Loyalty 3.202822e-03 NA NA
Family.Pride 1.941612e-03 NA NA
Expression 3.678510e-03 NA NA
Spend.Time.Together -2.102017e-04 NA NA
Feel.Close 1.327715e-04 NA NA
Togetherness 1.130552e-03 NA NA
Religious.Attendance -6.206760e-04 NA NA
Religious.Importance 4.387678e-04 NA NA
Close.knit.Community -1.151549e-03 NA NA
Helpful.Community 1.614995e-04 NA NA
Community.Shares.Values -8.626769e-04 NA NA
Get.Along 1.932312e-04 NA NA
Community.Trust 7.894886e-04 NA NA
Discrimination -6.112475e-04 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Physical Checkup
ps(`Physical Check-up`)# A tibble: 3 × 3
`Physical Check-up` n pct
<fct> <int> <dbl>
1 0 833 31.9
2 Yes 1740 66.7
3 <NA> 36 1.38
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2032
Frequency of class labels: 652, 1380
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 457.723
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1284
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 2.1166
(OOB) Brier score: 0.19394941
(OOB) Normalized Brier score: 0.77579766
(OOB) AUC: 0.72411421
(OOB) PR-AUC: 0.53617914
(OOB) G-mean: 0.66633872
(OOB) Requested performance error: 0.33366128
Confusion matrix:
predicted
observed 0 Yes class.error
0 425 227 0.3482
Yes 440 940 0.3188
(OOB) Misclassification rate: 0.328248
plot(imb,plots.one.page = F)

all 0 Yes
Health.Insurance 0.0312 NA NA
Age 0.0124 NA NA
Gender 0.0032 NA NA
Employment 0.0016 NA NA
EnglishSpeak 0.0014 NA NA
Income_median 0.0008 NA NA
Dental.Insurance -0.0013 NA NA
EnglishDiff -0.0053 NA NA
Discrimination -0.0073 NA NA
Religion -0.0124 NA NA
Ethnicity -0.0191 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1380.0000000 652.0000000 2.1165644 0.3208661 0.6518405 0.6811594
prec npv misclass brier brier.norm auc
0.4913295 0.8054841 0.3282480 0.1939494 0.7757977 0.7241142
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5603164 0.6370455 0.3208661 0.5361791 0.6133276 0.6516921
gmean
0.6663387
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1626
Frequency of class labels: 800, 826
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 307.8227
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1028
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0325
(OOB) Brier score: 0.1428173
(OOB) Normalized Brier score: 0.57126919
(OOB) AUC: 0.88234262
(OOB) PR-AUC: 0.87509222
(OOB) G-mean: 0.80314153
(OOB) Requested performance error: 0.19685847
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 640 160 0.2000
0 160 666 0.1937
(OOB) Misclassification rate: 0.196802
print(rfobj) Sample size: 1626
Frequency of class labels: 800, 826
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 307.8227
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1028
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0325
(OOB) Brier score: 0.1428173
(OOB) Normalized Brier score: 0.57126919
(OOB) AUC: 0.88234262
(OOB) PR-AUC: 0.87509222
(OOB) G-mean: 0.80314153
(OOB) Requested performance error: 0.19685847
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 640 160 0.2000
0 160 666 0.1937
(OOB) Misclassification rate: 0.196802
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Ethnicity 0.0597 NA NA
Religion 0.0521 NA NA
Health.Insurance 0.0520 NA NA
EnglishDiff 0.0436 NA NA
EnglishSpeak 0.0412 NA NA
Age 0.0375 NA NA
Gender 0.0258 NA NA
Employment 0.0246 NA NA
Income_median 0.0245 NA NA
Discrimination 0.0156 NA NA
Dental.Insurance 0.0153 NA NA
rfobj$importance all Yes 0
Ethnicity 0.05968643 NA NA
Age 0.03746991 NA NA
Gender 0.02577169 NA NA
Religion 0.05212527 NA NA
Employment 0.02460842 NA NA
Income_median 0.02445293 NA NA
EnglishSpeak 0.04115996 NA NA
EnglishDiff 0.04355390 NA NA
Health.Insurance 0.05199531 NA NA
Dental.Insurance 0.01530459 NA NA
Discrimination 0.01562159 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
276.0000000 130.0000000 2.1230769 0.3201970 0.8384615 0.4347826
prec npv misclass brier brier.norm auc
0.4113208 0.8510638 0.4359606 0.2073340 0.8293359 0.7284281
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5518987 0.5634713 0.3201970 0.5663324 0.5778386 0.5836249
gmean
0.6037785
test_rf$importance all Yes 0
Ethnicity 0.014364805 NA NA
Age 0.039086387 NA NA
Gender 0.019297506 NA NA
Religion -0.010370314 NA NA
Employment 0.004825927 NA NA
Income_median 0.001628219 NA NA
EnglishSpeak 0.003351993 NA NA
EnglishDiff 0.014527465 NA NA
Health.Insurance 0.059401232 NA NA
Dental.Insurance 0.022944603 NA NA
Discrimination 0.005618560 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Dental Checkup
ps(`Dentist Check-up`)# A tibble: 3 × 3
`Dentist Check-up` n pct
<fct> <int> <dbl>
1 0 1100 42.2
2 Yes 1462 56.0
3 <NA> 47 1.80
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2026
Frequency of class labels: 840, 1186
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 486.4383
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1280
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.4119
(OOB) Brier score: 0.21211144
(OOB) Normalized Brier score: 0.84844575
(OOB) AUC: 0.73064021
(OOB) PR-AUC: 0.61408391
(OOB) G-mean: 0.67786705
(OOB) Requested performance error: 0.32213295
Confusion matrix:
predicted
observed 0 Yes class.error
0 578 262 0.3119
Yes 394 792 0.3322
(OOB) Misclassification rate: 0.3237907
plot(imb,plots.one.page = F)

all 0 Yes
Dental.Insurance 0.0218 NA NA
Health.Insurance 0.0085 NA NA
EnglishSpeak -0.0012 NA NA
Employment -0.0015 NA NA
Discrimination -0.0043 NA NA
Religion -0.0096 NA NA
EnglishDiff -0.0102 NA NA
Ethnicity -0.0104 NA NA
Income_median -0.0121 NA NA
Age -0.0152 NA NA
Gender -0.0167 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1186.0000000 840.0000000 1.4119048 0.4146101 0.6880952 0.6677909
prec npv misclass brier brier.norm auc
0.5946502 0.7514231 0.3237907 0.2121114 0.8484457 0.7306402
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6379691 0.6707773 0.4146101 0.6140839 0.6579181 0.6743222
gmean
0.6778670
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1621
Frequency of class labels: 798, 823
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 303.016
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1024
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0313
(OOB) Brier score: 0.14581753
(OOB) Normalized Brier score: 0.58327014
(OOB) AUC: 0.87291436
(OOB) PR-AUC: 0.86568121
(OOB) G-mean: 0.78885027
(OOB) Requested performance error: 0.21114973
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 623 175 0.2193
0 167 656 0.2029
(OOB) Misclassification rate: 0.2109809
print(rfobj) Sample size: 1621
Frequency of class labels: 798, 823
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 303.016
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1024
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0313
(OOB) Brier score: 0.14581753
(OOB) Normalized Brier score: 0.58327014
(OOB) AUC: 0.87291436
(OOB) PR-AUC: 0.86568121
(OOB) G-mean: 0.78885027
(OOB) Requested performance error: 0.21114973
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 623 175 0.2193
0 167 656 0.2029
(OOB) Misclassification rate: 0.2109809
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Dental.Insurance 0.0442 NA NA
Religion 0.0337 NA NA
Ethnicity 0.0331 NA NA
Income_median 0.0297 NA NA
EnglishSpeak 0.0290 NA NA
EnglishDiff 0.0282 NA NA
Age 0.0210 NA NA
Gender 0.0133 NA NA
Health.Insurance 0.0127 NA NA
Employment 0.0011 NA NA
Discrimination -0.0051 NA NA
rfobj$importance all Yes 0
Ethnicity 0.033144943 NA NA
Age 0.021018090 NA NA
Gender 0.013311024 NA NA
Religion 0.033707298 NA NA
Employment 0.001147374 NA NA
Income_median 0.029733426 NA NA
EnglishSpeak 0.029024705 NA NA
EnglishDiff 0.028190525 NA NA
Health.Insurance 0.012714401 NA NA
Dental.Insurance 0.044202800 NA NA
Discrimination -0.005144584 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
237.0000000 168.0000000 1.4107143 0.4148148 0.7202381 0.6075949
prec npv misclass brier brier.norm auc
0.5654206 0.7539267 0.3456790 0.2195652 0.8782608 0.7242817
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6335079 0.6526087 0.4148148 0.6229583 0.6475156 0.6570660
gmean
0.6615233
test_rf$importance all Yes 0
Ethnicity 0.0006639681 NA NA
Age -0.0117103332 NA NA
Gender -0.0065929427 NA NA
Religion -0.0074821532 NA NA
Employment -0.0066514213 NA NA
Income_median -0.0049266321 NA NA
EnglishSpeak 0.0000184808 NA NA
EnglishDiff -0.0070078933 NA NA
Health.Insurance 0.0060148425 NA NA
Dental.Insurance 0.0418136484 NA NA
Discrimination -0.0016691434 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Urgent Care
ps(`Urgentcare`)# A tibble: 3 × 3
Urgentcare n pct
<fct> <int> <dbl>
1 0 2112 81.0
2 Yes 440 16.9
3 <NA> 57 2.18
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2017
Frequency of class labels: 1673, 344
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 401.601
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1275
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 4.8634
(OOB) Brier score: 0.15698872
(OOB) Normalized Brier score: 0.62795486
(OOB) AUC: 0.54438396
(OOB) PR-AUC: 0.19617836
(OOB) G-mean: 0.53928632
(OOB) Requested performance error: 0.46071368
Confusion matrix:
predicted
observed 0 Yes class.error
0 951 722 0.4316
Yes 168 176 0.4884
(OOB) Misclassification rate: 0.4412494
plot(imb,plots.one.page = F)

all 0 Yes
Discrimination 0.0190 NA NA
Ethnicity 0.0077 NA NA
Dental.Insurance 0.0074 NA NA
EnglishDiff -0.0018 NA NA
Gender -0.0024 NA NA
Employment -0.0034 NA NA
Health.Insurance -0.0042 NA NA
EnglishSpeak -0.0043 NA NA
Age -0.0044 NA NA
Religion -0.0058 NA NA
Income_median -0.0066 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1673.0000000 344.0000000 4.8633721 0.1705503 0.5116279 0.5684399
prec npv misclass brier brier.norm auc
0.1959911 0.8498660 0.4412494 0.1569887 0.6279549 0.5443840
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.2834138 0.4002932 0.1705503 0.1961784 0.4113501 0.4697898
gmean
0.5392863
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1615
Frequency of class labels: 793, 822
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 339.0257
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1021
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0366
(OOB) Brier score: 0.15016519
(OOB) Normalized Brier score: 0.60066076
(OOB) AUC: 0.87896681
(OOB) PR-AUC: 0.85969945
(OOB) G-mean: 0.8098881
(OOB) Requested performance error: 0.1901119
Confusion matrix:
predicted
observed 0 Yes class.error
0 626 167 0.2106
Yes 139 683 0.1691
(OOB) Misclassification rate: 0.1894737
print(rfobj) Sample size: 1615
Frequency of class labels: 793, 822
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 339.0257
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1021
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0366
(OOB) Brier score: 0.15016519
(OOB) Normalized Brier score: 0.60066076
(OOB) AUC: 0.87896681
(OOB) PR-AUC: 0.85969945
(OOB) G-mean: 0.8098881
(OOB) Requested performance error: 0.1901119
Confusion matrix:
predicted
observed 0 Yes class.error
0 626 167 0.2106
Yes 139 683 0.1691
(OOB) Misclassification rate: 0.1894737
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
Religion 0.0714 NA NA
Ethnicity 0.0565 NA NA
EnglishDiff 0.0558 NA NA
EnglishSpeak 0.0378 NA NA
Age 0.0365 NA NA
Dental.Insurance 0.0292 NA NA
Discrimination 0.0290 NA NA
Gender 0.0259 NA NA
Income_median 0.0210 NA NA
Health.Insurance 0.0125 NA NA
Employment 0.0107 NA NA
rfobj$importance all 0 Yes
Ethnicity 0.05649963 NA NA
Age 0.03654373 NA NA
Gender 0.02594001 NA NA
Religion 0.07144963 NA NA
Employment 0.01074328 NA NA
Income_median 0.02095404 NA NA
EnglishSpeak 0.03780840 NA NA
EnglishDiff 0.05581575 NA NA
Health.Insurance 0.01245267 NA NA
Dental.Insurance 0.02916377 NA NA
Discrimination 0.02901248 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
334.0000000 68.0000000 4.9117647 0.1691542 0.9852941 0.1197605
prec npv misclass brier brier.norm auc
0.1855956 0.9756098 0.7338308 0.2004005 0.8016018 0.6097878
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3123543 0.2535178 0.1691542 0.2236047 0.3279323 0.2985140
gmean
0.3435103
test_rf$importance all 0 Yes
Ethnicity 0.0556660407 NA NA
Age 0.0011158844 NA NA
Gender -0.0003034483 NA NA
Religion 0.0345407523 NA NA
Employment -0.0098904155 NA NA
Income_median 0.0063793389 NA NA
EnglishSpeak 0.0199196678 NA NA
EnglishDiff 0.0498507656 NA NA
Health.Insurance 0.0001078272 NA NA
Dental.Insurance 0.0104565421 NA NA
Discrimination 0.0075008564 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Folk Medicine
ps(`Folkmedicine`)# A tibble: 3 × 3
Folkmedicine n pct
<fct> <int> <dbl>
1 0 2189 83.9
2 Yes 348 13.3
3 <NA> 72 2.76
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2012
Frequency of class labels: 1735, 277
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 323.723
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1272
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 6.2635
(OOB) Brier score: 0.11478476
(OOB) Normalized Brier score: 0.45913903
(OOB) AUC: 0.68994476
(OOB) PR-AUC: 0.26393605
(OOB) G-mean: 0.65278483
(OOB) Requested performance error: 0.34721517
Confusion matrix:
predicted
observed 0 Yes class.error
0 1107 628 0.3620
Yes 92 185 0.3321
(OOB) Misclassification rate: 0.3578529
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0456 NA NA
Ethnicity 0.0447 NA NA
EnglishSpeak 0.0345 NA NA
Employment 0.0166 NA NA
Discrimination 0.0141 NA NA
Income_median 0.0133 NA NA
EnglishDiff 0.0100 NA NA
Gender 0.0060 NA NA
Religion 0.0014 NA NA
Health.Insurance 0.0008 NA NA
Dental.Insurance -0.0015 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1735.0000000 277.0000000 6.2635379 0.1376740 0.6678700 0.6380403
prec npv misclass brier brier.norm auc
0.2275523 0.9232694 0.3578529 0.1147848 0.4591390 0.6899448
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3394495 0.4682581 0.1376740 0.2639361 0.4961172 0.5605214
gmean
0.6527848
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1610
Frequency of class labels: 791, 819
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 303.209
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1018
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0354
(OOB) Brier score: 0.12083947
(OOB) Normalized Brier score: 0.48335789
(OOB) AUC: 0.92561154
(OOB) PR-AUC: 0.9239028
(OOB) G-mean: 0.84214075
(OOB) Requested performance error: 0.15785925
Confusion matrix:
predicted
observed 0 Yes class.error
0 639 152 0.1922
Yes 100 719 0.1221
(OOB) Misclassification rate: 0.1565217
print(rfobj) Sample size: 1610
Frequency of class labels: 791, 819
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 303.209
No. of variables tried at each split: 4
Total no. of variables: 11
Resampling used to grow trees: swor
Resample size used to grow trees: 1018
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0354
(OOB) Brier score: 0.12083947
(OOB) Normalized Brier score: 0.48335789
(OOB) AUC: 0.92561154
(OOB) PR-AUC: 0.9239028
(OOB) G-mean: 0.84214075
(OOB) Requested performance error: 0.15785925
Confusion matrix:
predicted
observed 0 Yes class.error
0 639 152 0.1922
Yes 100 719 0.1221
(OOB) Misclassification rate: 0.1565217
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
Religion 0.0758 NA NA
Ethnicity 0.0732 NA NA
EnglishDiff 0.0558 NA NA
EnglishSpeak 0.0460 NA NA
Dental.Insurance 0.0311 NA NA
Age 0.0266 NA NA
Gender 0.0237 NA NA
Employment 0.0167 NA NA
Income_median 0.0125 NA NA
Health.Insurance 0.0097 NA NA
Discrimination 0.0091 NA NA
rfobj$importance all 0 Yes
Ethnicity 0.073217837 NA NA
Age 0.026637303 NA NA
Gender 0.023723749 NA NA
Religion 0.075761540 NA NA
Employment 0.016746624 NA NA
Income_median 0.012502924 NA NA
EnglishSpeak 0.046027137 NA NA
EnglishDiff 0.055805156 NA NA
Health.Insurance 0.009674993 NA NA
Dental.Insurance 0.031051997 NA NA
Discrimination 0.009053117 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
347.0000000 55.0000000 6.3090909 0.1368159 0.9272727 0.2017291
prec npv misclass brier brier.norm auc
0.1554878 0.9459459 0.6990050 0.1844695 0.7378781 0.5884202
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.2663185 0.2957685 0.1368159 0.1881303 0.3494102 0.3641352
gmean
0.4325019
test_rf$importance all 0 Yes
Ethnicity 0.0238138463 NA NA
Age -0.0017784139 NA NA
Gender -0.0002290971 NA NA
Religion -0.0149806420 NA NA
Employment 0.0139455879 NA NA
Income_median 0.0059245342 NA NA
EnglishSpeak 0.0110700102 NA NA
EnglishDiff 0.0092463758 NA NA
Health.Insurance 0.0055529282 NA NA
Dental.Insurance -0.0191544039 NA NA
Discrimination -0.0149694157 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot